# Load Data
weather <- read.csv("enchantWeather.csv")
lotto <- read.csv("enchantLotto.csv")
# Rename Day.of.Year column on lottery results set (dplyr::rename)
lotto <- rename(lotto, Day = Day.of.Year)
# Merge data frames on the shared Day column (inner join by default)
df <- merge(x = lotto, y = weather, by = "Day")
# Type conversions
df$Preferred.Entry.Date <- as.Date(df$Preferred.Entry.Date, "%m/%d/%Y") # character -> Date
df$Result <- as.factor(df$Result)                         # Convert result to factor
df$Preference.Order <- as.factor(df$Preference.Order)     # Convert Preference.Order to factor
df$Preferred.Division <- as.factor(df$Preferred.Division) # Convert Preferred.Division to factor
# Drop table name column. The "ï.." prefix is a UTF-8 byte-order mark picked up
# by read.csv; reading with fileEncoding = "UTF-8-BOM" would avoid the mangled name.
df$ï..Table.Names <- NULL
# Dummy variable for Result (1 = Accepted, 0 = Unsuccessful)
df$resultBin <- ifelse(df$Result == "Accepted", 1, 0)
# Indicator: both target weather conditions met (dry AND warm day)
df$nwTrekConditions <- as.integer(ifelse(df$avgPrecipIn < 0.03 & df$avgTempMean > 52, 1, 0))
# ISO-8601 week number of the preferred entry date, as integer.
# NOTE(review): strftime "%V" is not supported on Windows — confirm platform.
df$Week <- as.integer(strftime(df$Preferred.Entry.Date, format = "%V"))
# Exploratory overview of the merged data frame
# Data summary
summary(df)
## Day Result Preference.Order Preferred.Entry.Date
## Min. :134.0 Accepted : 2445 First :36695 Min. :2021-05-15
## 1st Qu.:201.0 Unsuccessful:106197 Second:36273 1st Qu.:2021-07-21
## Median :222.0 Third :35674 Median :2021-08-11
## Mean :220.6 Mean :2021-08-09
## 3rd Qu.:243.0 3rd Qu.:2021-09-01
## Max. :303.0 Max. :2021-10-31
##
## Preferred.Division Minimum.Acceptable.Group.Size
## Core Enchantment Zone :70433 Min. :1.000
## Colchuck Zone :16493 1st Qu.:4.000
## Snow Zone :12481 Median :4.000
## Stuart Zone : 6346 Mean :4.985
## Eightmile/Caroline Zone: 2425 3rd Qu.:6.000
## Stuart Zone (stock) : 199 Max. :8.000
## (Other) : 265
## Maximum.Requested.Group.Size avgPrecipIn avgTempMax
## Min. :1.000 Min. :0.002683 Min. :40.29
## 1st Qu.:4.000 1st Qu.:0.011951 1st Qu.:63.73
## Median :4.000 Median :0.021220 Median :67.95
## Mean :4.985 Mean :0.027856 Mean :66.85
## 3rd Qu.:6.000 3rd Qu.:0.037073 3rd Qu.:71.17
## Max. :8.000 Max. :0.281463 Max. :74.11
##
## avgTempMean avgTempMin resultBin nwTrekConditions
## Min. :33.40 Min. :26.53 Min. :0.00000 Min. :0.00
## 1st Qu.:53.00 1st Qu.:42.18 1st Qu.:0.00000 1st Qu.:0.00
## Median :56.60 Median :44.79 Median :0.00000 Median :1.00
## Mean :55.53 Mean :44.21 Mean :0.02251 Mean :0.61
## 3rd Qu.:59.40 3rd Qu.:47.53 3rd Qu.:0.00000 3rd Qu.:1.00
## Max. :61.72 Max. :49.53 Max. :1.00000 Max. :1.00
##
## Week
## Min. :19.00
## 1st Qu.:29.00
## Median :32.00
## Mean :31.68
## 3rd Qu.:35.00
## Max. :43.00
##
# Data structure: 108,642 rows, 14 variables after prep
str(df)
## 'data.frame': 108642 obs. of 14 variables:
## $ Day : num 134 134 134 134 134 134 134 134 134 134 ...
## $ Result : Factor w/ 2 levels "Accepted","Unsuccessful": 2 2 2 1 2 2 2 2 2 2 ...
## $ Preference.Order : Factor w/ 3 levels "First","Second",..: 1 1 1 1 3 2 2 1 1 2 ...
## $ Preferred.Entry.Date : Date, format: "2021-05-15" "2021-05-15" ...
## $ Preferred.Division : Factor w/ 8 levels "Colchuck Zone",..: 2 6 1 7 1 3 2 3 2 2 ...
## $ Minimum.Acceptable.Group.Size: int 3 5 2 6 8 6 3 2 5 2 ...
## $ Maximum.Requested.Group.Size : int 3 5 2 6 8 6 3 2 5 2 ...
## $ avgPrecipIn : num 0.107 0.107 0.107 0.107 0.107 ...
## $ avgTempMax : num 52.8 52.8 52.8 52.8 52.8 ...
## $ avgTempMean : num 42.7 42.7 42.7 42.7 42.7 ...
## $ avgTempMin : num 32.5 32.5 32.5 32.5 32.5 ...
## $ resultBin : num 0 0 0 1 0 0 0 0 0 0 ...
## $ nwTrekConditions : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Week : int 19 19 19 19 19 19 19 19 19 19 ...
# First 6 entries (head() default)
head(df)
## Day Result Preference.Order Preferred.Entry.Date
## 1 134 Unsuccessful First 2021-05-15
## 2 134 Unsuccessful First 2021-05-15
## 3 134 Unsuccessful First 2021-05-15
## 4 134 Accepted First 2021-05-15
## 5 134 Unsuccessful Third 2021-05-15
## 6 134 Unsuccessful Second 2021-05-15
## Preferred.Division Minimum.Acceptable.Group.Size
## 1 Core Enchantment Zone 3
## 2 Snow Zone 5
## 3 Colchuck Zone 2
## 4 Stuart Zone 6
## 5 Colchuck Zone 8
## 6 Eightmile/Caroline Zone 6
## Maximum.Requested.Group.Size avgPrecipIn avgTempMax avgTempMean avgTempMin
## 1 3 0.1070732 52.82195 42.68537 32.52927
## 2 5 0.1070732 52.82195 42.68537 32.52927
## 3 2 0.1070732 52.82195 42.68537 32.52927
## 4 6 0.1070732 52.82195 42.68537 32.52927
## 5 8 0.1070732 52.82195 42.68537 32.52927
## 6 6 0.1070732 52.82195 42.68537 32.52927
## resultBin nwTrekConditions Week
## 1 0 0 19
## 2 0 0 19
## 3 0 0 19
## 4 1 0 19
## 5 0 0 19
## 6 0 0 19
# Shared helper: stack a distribution plot of `vbl` (from the global df) on
# top of a horizontal boxplot of the same variable. `topGeom` supplies the
# geom used in the upper panel. `vbl` is an unquoted column name, forwarded
# with the tidy-eval embrace operator {{ }}.
distWithBox <- function(vbl, topGeom) {
  grid.arrange(
    ggplot(data = df, mapping = aes(x = {{vbl}})) +
      topGeom +
      theme_minimal(),
    ggplot(data = df, mapping = aes(x = 0)) +
      geom_boxplot(mapping = aes(y = {{vbl}})) +
      coord_flip() +
      theme_minimal()
  )
}
# Bar chart + boxplot, for discrete/integer variables
barBox <- function(vbl) {
  distWithBox({{vbl}}, geom_bar())
}
# Density plot + boxplot, for continuous variables
densityBox <- function(vbl) {
  distWithBox({{vbl}}, geom_density())
}
# Bar + box views of the discrete group-size variables
barBox(Minimum.Acceptable.Group.Size)
barBox(Maximum.Requested.Group.Size)
# Density + box views of the continuous/ordinal variables
densityBox(Day)
densityBox(Week)
densityBox(avgPrecipIn)
densityBox(avgTempMax)
densityBox(avgTempMean)
densityBox(avgTempMin)
# Class balance of the lottery outcome (Accepted is rare: 2,445 of 108,642)
df %>%
ggplot(aes(x = Result)) +
geom_bar() +
theme_minimal()
# Daily mean precipitation for the Core Enchantment Zone, colored by whether
# the day falls under the 0.03 in dryness threshold; vertical dashed lines
# bracket days 182-248.
precipDate <- df |>
  filter(Preferred.Division == "Core Enchantment Zone") |>
  group_by(Day) |>
  summarise(averagePrecipitation = mean(avgPrecipIn)) |>
  ggplot(aes(x = Day,
             y = averagePrecipitation,
             color = averagePrecipitation < 0.03)) +
  geom_point() +
  geom_vline(xintercept = c(182, 248), linetype = "dashed") +
  geom_hline(yintercept = 0.03, linetype = "dashed") +
  theme_classic() +
  labs(title = "Average Precipitation Over Time",
       subtitle = "Average precipitation bottoms out in the summer months",
       x = "Day",
       y = "Average Precipitation",
       color = "Precipitation < 0.03 in")
precipDate            # static ggplot
ggplotly(precipDate)  # interactive plotly rendering
# Average precipitation per (day, zone, result), faceted by zone.
# `.groups = "drop"` returns an ungrouped result and silences the
# summarise() regrouping message; the grouping was unused downstream.
facetedPrecip <- df %>%
  group_by(Day, Preferred.Division, Result) %>%
  summarise(averagePrecipitation = mean(avgPrecipIn), .groups = "drop") %>%
  ggplot(aes(x = Day, y = averagePrecipitation, color = Result)) +
  geom_point(alpha = 0.45, position = "jitter") +
  facet_wrap(~ Preferred.Division) +
  labs(title = "Average Precipitation by Zone",
       subtitle = "Precipitation follows a similar pattern for all zones",
       x = "Day of Year",
       y = "Average Precipitation (inches)") +
  theme_minimal()
facetedPrecip
ggplotly(facetedPrecip)
# Daily mean temperature for the Core Enchantment Zone, colored by whether
# the day clears the 52F warmth threshold (dashed horizontal line).
tempDate <- df |>
  filter(Preferred.Division == "Core Enchantment Zone") |>
  group_by(Day) |>
  summarise(averageTemperature = mean(avgTempMean)) |>
  ggplot(aes(x = Day,
             y = averageTemperature,
             color = averageTemperature > 52)) +
  geom_point() +
  geom_hline(yintercept = 52, linetype = "dashed") +
  theme_classic() +
  labs(title = "Average Temperature Over Time (Core Zone)",
       subtitle = "Average temperature tops out in the summer months",
       x = "Day",
       y = "Average Temperature",
       color = "Temperature > 52F")
tempDate            # static ggplot
ggplotly(tempDate)  # interactive plotly rendering
# Average temperature per (day, zone, result), faceted by zone.
# `.groups = "drop"` returns an ungrouped result and silences the
# summarise() regrouping message; the grouping was unused downstream.
facetedTemp <- df %>%
  group_by(Day, Preferred.Division, Result) %>%
  summarise(averageTemp = mean(avgTempMean), .groups = "drop") %>%
  ggplot(aes(x = Day, y = averageTemp, color = Result)) +
  geom_point(alpha = 0.45, position = "jitter") +
  facet_wrap(~ Preferred.Division) +
  labs(title = "Average Temperature by Zone",
       subtitle = "Temperature follows a similar pattern for all zones",
       x = "Day of Year",
       y = "Average Temperature (F)") +
  theme_minimal()
facetedTemp
ggplotly(facetedTemp)
# Create separate data frame containing ratios of accepted to unsuccessful.
# Subtle but intentional: summarize() drops only the last grouping level
# (Result), so the frame stays grouped by Preferred.Division and the
# sum(N) in mutate() is a per-division total — Ratio is the within-division
# share of each Result. Do not add .groups = "drop" here.
resultRatio <- df %>%
group_by(Preferred.Division, Result) %>%
summarize(N = n()) %>%
mutate(Ratio = round(N / sum(N), 2))
## `summarise()` has grouped output by 'Preferred.Division'. You can override using the `.groups` argument.
# Returns flipped column chart with top value highlighted
avgAcceptance <- resultRatio %>%
filter(Result == "Accepted") %>%
group_by(Preferred.Division) %>%
summarise(avgAcceptance = mean(Ratio)) %>% # No-op: there is exactly one Ratio per division; kept to name the column avgAcceptance
ggplot(mapping = aes(fct_reorder(Preferred.Division, avgAcceptance), y = avgAcceptance*100)) +
geom_col() +
coord_flip() +
gghighlight(Preferred.Division == "Core Enchantment Zone") +
labs(title = "Acceptance Rate by Division",
subtitle = "Core Enchantment Zone had the lowest acceptance rate (about 1%)",
x = "Preferred Division",
y = "Average Acceptance (%)") +
theme_minimal()
avgAcceptance
ggplotly(avgAcceptance)
# Scatter of the binary weather indicator against day of year
df %>%
ggplot(aes(x = Day, y = nwTrekConditions)) +
geom_point() +
theme_minimal()
# Restrict to the modeling features: outcome, day of year, zone, group size
dfSpec <- df[, c('Result', 'Day', 'Preferred.Division', 'Maximum.Requested.Group.Size')]
set.seed(777)
# Use 70% of dataset as training set and remaining 30% as testing set.
# Mask named inTrain (not `sample`) to avoid shadowing base::sample();
# same seed and call, so the RNG stream and the split are unchanged.
inTrain <- sample(c(TRUE, FALSE), nrow(dfSpec), replace = TRUE, prob = c(0.7, 0.3))
train <- dfSpec[inTrain, ]
test <- dfSpec[!inTrain, ]
# Retrieve counts of accepted and unsuccessful applications
table(train$Result)
##
## Accepted Unsuccessful
## 1722 74363
# Over sampling: replicate minority-class rows until N = 2 * 74363
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.1.2
## Loaded ROSE 0.0-4
df_balanced_over <- ovun.sample(Result ~ Maximum.Requested.Group.Size + Day + Preferred.Division, data = train, method = "over", N=148726)$data
table(df_balanced_over$Result)
##
## Unsuccessful Accepted
## 74363 74363
# Under sampling: discard majority-class rows down to N = 2 * 1722
df_balanced_under <- ovun.sample(Result ~ Maximum.Requested.Group.Size + Day + Preferred.Division, data = train, method = "under", N=3444)$data
table(df_balanced_under$Result)
##
## Unsuccessful Accepted
## 1722 1722
# Using both over and under sampling, targeting a 50/50 mix at the training-set size
df_balanced_both <- ovun.sample(Result ~ Maximum.Requested.Group.Size + Day + Preferred.Division, data = train, method = "both", p=0.5, N=76085, seed = 1)$data
table(df_balanced_both$Result)
##
## Unsuccessful Accepted
## 37910 38175
# Synthetic data generation (ROSE: smoothed-bootstrap synthetic examples)
df.rose <- ROSE(Result ~ Maximum.Requested.Group.Size + Day + Preferred.Division, data = train, seed=1)$data
table(df.rose$Result)
##
## Unsuccessful Accepted
## 37910 38175
# Fit one classification tree per rebalanced training set; the balanced
# frames contain only Result + the three predictors, so `Result ~ .` is safe.
tree.rose <- rpart(Result ~ .,
data = df.rose,
method = "class")
tree.over <- rpart(Result ~ .,
data = df_balanced_over,
method = "class")
tree.under <- rpart(Result ~ .,
data = df_balanced_under,
method = "class")
tree.both <- rpart(Result ~ .,
data = df_balanced_both,
method = "class")
# Evaluate each tree on the untouched (still imbalanced) test set.
# ROSE-synthesized tree:
pred.tree.rose <- predict(tree.rose, newdata=test, type = "class")
caret::confusionMatrix(pred.tree.rose, test$Result)
## Warning in confusionMatrix.default(pred.tree.rose, test$Result): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 546 7988
## Unsuccessful 177 23846
##
## Accuracy : 0.7492
## 95% CI : (0.7445, 0.7539)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0803
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75519
## Specificity : 0.74907
## Pos Pred Value : 0.06398
## Neg Pred Value : 0.99263
## Prevalence : 0.02221
## Detection Rate : 0.01677
## Detection Prevalence : 0.26212
## Balanced Accuracy : 0.75213
##
## 'Positive' Class : Accepted
##
#plotcp(tree.rose)
# Oversampled tree:
pred.tree.over <- predict(tree.over, newdata=test, type = "class")
caret::confusionMatrix(pred.tree.over, test$Result)
## Warning in confusionMatrix.default(pred.tree.over, test$Result): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 564 9030
## Unsuccessful 159 22804
##
## Accuracy : 0.7178
## 95% CI : (0.7128, 0.7226)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.071
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.78008
## Specificity : 0.71634
## Pos Pred Value : 0.05879
## Neg Pred Value : 0.99308
## Prevalence : 0.02221
## Detection Rate : 0.01732
## Detection Prevalence : 0.29468
## Balanced Accuracy : 0.74821
##
## 'Positive' Class : Accepted
##
#plotcp(tree.over)
# Undersampled tree (best accuracy of the four: 0.761):
pred.tree.under <- predict(tree.under, newdata=test, type = "class")
caret::confusionMatrix(pred.tree.under, test$Result)
## Warning in confusionMatrix.default(pred.tree.under, test$Result): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 535 7592
## Unsuccessful 188 24242
##
## Accuracy : 0.761
## 95% CI : (0.7564, 0.7657)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0835
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.73997
## Specificity : 0.76151
## Pos Pred Value : 0.06583
## Neg Pred Value : 0.99230
## Prevalence : 0.02221
## Detection Rate : 0.01643
## Detection Prevalence : 0.24962
## Balanced Accuracy : 0.75074
##
## 'Positive' Class : Accepted
##
#plotcp(tree.under)
# Combined over/under tree:
pred.tree.both <- predict(tree.both, newdata=test, type = "class")
caret::confusionMatrix(pred.tree.both, test$Result)
## Warning in confusionMatrix.default(pred.tree.both, test$Result): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 562 8958
## Unsuccessful 161 22876
##
## Accuracy : 0.7199
## 95% CI : (0.715, 0.7248)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0714
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.77732
## Specificity : 0.71860
## Pos Pred Value : 0.05903
## Neg Pred Value : 0.99301
## Prevalence : 0.02221
## Detection Rate : 0.01726
## Detection Prevalence : 0.29241
## Balanced Accuracy : 0.74796
##
## 'Positive' Class : Accepted
##
#plotcp(tree.both)
# Visualize the undersampled tree (the best performer above)
rpart.plot(tree.under)
Notes on Acceptance Rates
Before June 17th, probability of acceptance is 80%
After October 5th, probability of acceptance rises to 85%
Between June 17th and October 5th, the acceptance rate is 27%
Notes on Preferred Division and Group Size
We find that maximum requested group size did not play an important role in estimating acceptance rates
Preferred division, however, is important. If you are wanting to be selected for either the Core Zone or Colchuck zone, your acceptance probability is heavily dependent on time of year.
Note about trees:
Each node shows
the predicted class (weather conditions met or not),
the predicted probability of weather conditions being met,
the percentage of observations in the node
set.seed(777)
#Use 70% of dataset as training set and remaining 30% as testing set.
# NOTE(review): this overwrites the earlier `sample`/`train`/`test` from the
# dfSpec split — `train`/`test` now hold ALL 14 df columns (including
# resultBin), which matters for the random forest fits further down.
# `sample` also shadows base::sample() until reassigned.
sample <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3))
train <- df[sample, ]
test <- df[!sample, ]
# Classify the weather indicator from day of year alone
tree <- rpart(nwTrekConditions ~ Day, train, method = "class")
rpart.plot(tree)
# Get test set predictions
tree.pred <- predict(tree, test, type = "class")
# Build confusion matrix with caret package
caret::confusionMatrix(as.factor(tree.pred), as.factor(test$nwTrekConditions))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12271 69
## 1 371 19846
##
## Accuracy : 0.9865
## 95% CI : (0.9852, 0.9877)
## No Information Rate : 0.6117
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9714
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9707
## Specificity : 0.9965
## Pos Pred Value : 0.9944
## Neg Pred Value : 0.9816
## Prevalence : 0.3883
## Detection Rate : 0.3769
## Detection Prevalence : 0.3790
## Balanced Accuracy : 0.9836
##
## 'Positive' Class : 0
##
# Retrieve complexity parameter or CP table for each pruning candidate
printcp(tree)
##
## Classification tree:
## rpart(formula = nwTrekConditions ~ Day, data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] Day
##
## Root node error: 29723/76085 = 0.39066
##
## n= 76085
##
## CP nsplit rel error xerror xstd
## 1 0.415705 0 1.000000 1.000000 0.0045278
## 2 0.249066 1 0.584295 0.584295 0.0038950
## 3 0.030532 2 0.335229 0.335229 0.0031307
## 4 0.028110 4 0.274165 0.274165 0.0028699
## 5 0.027083 6 0.217946 0.217946 0.0025900
## 6 0.022054 8 0.163779 0.163779 0.0022710
## 7 0.015644 10 0.119672 0.119672 0.0019591
## 8 0.014316 12 0.088383 0.088383 0.0016944
## 9 0.012566 14 0.059752 0.059752 0.0014012
## 10 0.010000 16 0.034620 0.034620 0.0010719
# Plot cross-validated error vs. complexity parameter
plotcp(tree)
Notes
The plotcp() function provides the cross validated error rate for various complexity parameter thresholds.
Ideally, you would expect the error to be very high for high values of cp, which will then gradually decrease before increasing again or flattening out (bias-variance trade-off).
We observed that the complexity parameter can remain at the default of 0.01, as adjusting it to 0.011 produces no noticeable effect on the model or the predictions.
# Repeat the weather-indicator tree using only Core Enchantment Zone rows
coreZoneDat <- df %>%
filter(Preferred.Division == "Core Enchantment Zone")
set.seed(777)
#Use 70% of dataset as training set and remaining 30% as testing set
sampleCore <- sample(c(TRUE, FALSE), nrow(coreZoneDat), replace=TRUE, prob=c(0.7,0.3))
trainCore <- coreZoneDat[sampleCore, ]
testCore <- coreZoneDat[!sampleCore, ]
treeCore <- rpart(nwTrekConditions ~ Day, trainCore, method = "class")
rpart.plot(treeCore)
# Get test set predictions
treeCore.pred <- predict(treeCore, testCore, type = "class")
# Build confusion matrix with caret package
caret::confusionMatrix(as.factor(treeCore.pred), as.factor(testCore$nwTrekConditions))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7951 33
## 1 272 12883
##
## Accuracy : 0.9856
## 95% CI : (0.9839, 0.9871)
## No Information Rate : 0.611
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9695
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9669
## Specificity : 0.9974
## Pos Pred Value : 0.9959
## Neg Pred Value : 0.9793
## Prevalence : 0.3890
## Detection Rate : 0.3761
## Detection Prevalence : 0.3777
## Balanced Accuracy : 0.9822
##
## 'Positive' Class : 0
##
# Retrieve complexity parameter or CP table
printcp(treeCore)
##
## Classification tree:
## rpart(formula = nwTrekConditions ~ Day, data = trainCore, method = "class")
##
## Variables actually used in tree construction:
## [1] Day
##
## Root node error: 19350/49294 = 0.39254
##
## n= 49294
##
## CP nsplit rel error xerror xstd
## 1 0.437881 0 1.000000 1.000000 0.0056030
## 2 0.230853 1 0.562119 0.562119 0.0047582
## 3 0.031473 2 0.331266 0.331266 0.0038592
## 4 0.027287 4 0.268320 0.268320 0.0035222
## 5 0.021860 8 0.159173 0.159173 0.0027771
## 6 0.014625 10 0.115452 0.115452 0.0023867
## 7 0.013928 12 0.086202 0.086202 0.0020746
## 8 0.012429 14 0.058346 0.058346 0.0017165
## 9 0.010000 16 0.033488 0.033488 0.0013069
plotcp(treeCore)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# fit random forest model on the unbalanced training set.
# NOTE(review): `train` here is the full-df split (all 14 columns), so the
# predictors include resultBin — a direct numeric recoding of Result — plus
# the weather columns nwTrekConditions is derived from. The 0% OOB error and
# the perfect test accuracy below are target leakage, not genuine skill.
rf_orig <- randomForest(
formula = Result ~ .,
data = train
)
rf_orig
##
## Call:
## randomForest(formula = Result ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## Accepted Unsuccessful class.error
## Accepted 1722 0 0
## Unsuccessful 0 74363 0
pred.forest.orig <- predict(rf_orig, newdata = test)
caret::confusionMatrix(pred.forest.orig, test$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 723 0
## Unsuccessful 0 31834
##
## Accuracy : 1
## 95% CI : (0.9999, 1)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.00000
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 1.00000
## Prevalence : 0.02221
## Detection Rate : 0.02221
## Detection Prevalence : 0.02221
## Balanced Accuracy : 1.00000
##
## 'Positive' Class : Accepted
##
library(randomForest)
set.seed(1)
# fit random forest model on the ROSE-synthesized (balanced, 3-predictor) data
rf_rose <- randomForest(
formula = Result ~ .,
data = df.rose
)
rf_rose
##
## Call:
## randomForest(formula = Result ~ ., data = df.rose)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 24.59%
## Confusion matrix:
## Unsuccessful Accepted class.error
## Unsuccessful 28679 9231 0.2434978
## Accepted 9478 28697 0.2482777
# NOTE(review): err.rate is an ntree x 3 matrix (OOB + one column per class);
# which.min() flattens it, so "502" is a matrix index, not a tree count.
# To find the best tree count use which.min(rf_rose$err.rate[, "OOB"]).
which.min(rf_rose$err.rate)
## [1] 502
# Find RMSE
# NOTE(review): sqrt of a classification error rate is not an RMSE in the
# regression sense — treat this number as descriptive only.
sqrt(rf_rose$err.rate[which.min(rf_rose$err.rate)])
## [1] 0.463506
# Error-rate trace over trees, and variable importance
plot(rf_rose)
varImpPlot(rf_rose)
Notes
# Tune mtry for the ROSE model (all 3 predictors are columns 2:4)
rf_rose_tuned <- tuneRF(
x=df.rose[,c(2,3,4)], #define predictor variables
y=df.rose$Result, #define response variable
ntreeTry=500,
mtryStart=3,
stepFactor=1.5,
improve=0.01,
trace=FALSE #don't show real-time progress
)
## 0.00738432 0.01
# Evaluate the (untuned) ROSE forest on the held-out test set
pred.forest.rose <- predict(rf_rose, newdata = test)
caret::confusionMatrix(pred.forest.rose, test$Result)
## Warning in confusionMatrix.default(pred.forest.rose, test$Result): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 532 7158
## Unsuccessful 191 24676
##
## Accuracy : 0.7743
## 95% CI : (0.7697, 0.7788)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0895
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.73582
## Specificity : 0.77515
## Pos Pred Value : 0.06918
## Neg Pred Value : 0.99232
## Prevalence : 0.02221
## Detection Rate : 0.01634
## Detection Prevalence : 0.23620
## Balanced Accuracy : 0.75548
##
## 'Positive' Class : Accepted
##
# fit random forest model on the combined over/under-sampled data
rf_both <- randomForest(
formula = Result ~ .,
data = df_balanced_both
)
rf_both
##
## Call:
## randomForest(formula = Result ~ ., data = df_balanced_both)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 24.22%
## Confusion matrix:
## Unsuccessful Accepted class.error
## Unsuccessful 28968 8942 0.2358744
## Accepted 9487 28688 0.2485134
# NOTE(review): which.min over the full err.rate matrix — see rf_rose note
which.min(rf_both$err.rate)
## [1] 502
# Find RMSE (descriptive only; not a regression RMSE)
sqrt(rf_both$err.rate[which.min(rf_both$err.rate)])
## [1] 0.4570616
plot(rf_both)
varImpPlot(rf_both)
# Tune mtry (predictors are columns 2:4)
rf_both_tuned <- tuneRF(
x=df_balanced_both[,c(2,3,4)], #define predictor variables
y=df_balanced_both$Result, #define response variable
ntreeTry=500,
mtryStart=3,
stepFactor=1.5,
improve=0.01,
trace=FALSE #don't show real-time progress
)
## -0.1599021 0.01
# Evaluate on the held-out test set
pred.forest.both <- predict(rf_both, newdata = test)
caret::confusionMatrix(pred.forest.both, test$Result)
## Warning in confusionMatrix.default(pred.forest.both, test$Result): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 528 7533
## Unsuccessful 195 24301
##
## Accuracy : 0.7626
## 95% CI : (0.758, 0.7672)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0828
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.73029
## Specificity : 0.76337
## Pos Pred Value : 0.06550
## Neg Pred Value : 0.99204
## Prevalence : 0.02221
## Detection Rate : 0.01622
## Detection Prevalence : 0.24760
## Balanced Accuracy : 0.74683
##
## 'Positive' Class : Accepted
##
# fit random forest model on the undersampled data (only 3,444 rows)
rf_under <- randomForest(
formula = Result ~ .,
data = df_balanced_under
)
rf_under
##
## Call:
## randomForest(formula = Result ~ ., data = df_balanced_under)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 25.26%
## Confusion matrix:
## Unsuccessful Accepted class.error
## Unsuccessful 1309 413 0.2398374
## Accepted 457 1265 0.2653891
# NOTE(review): which.min over the full err.rate matrix — see rf_rose note
which.min(rf_under$err.rate)
## [1] 503
# Find RMSE (descriptive only; not a regression RMSE)
sqrt(rf_under$err.rate[which.min(rf_under$err.rate)])
## [1] 0.4834208
plot(rf_under)
varImpPlot(rf_under)
# Tune mtry (predictors are columns 2:4)
rf_under_tuned <- tuneRF(
x=df_balanced_under[,c(2,3,4)], #define predictor variables
y=df_balanced_under$Result, #define response variable
ntreeTry=500,
mtryStart=3,
stepFactor=1.5,
improve=0.01,
trace=FALSE #don't show real-time progress
)
## 0.08115672 0.01
# Evaluate on the held-out test set
pred.forest.under <- predict(rf_under, newdata = test)
caret::confusionMatrix(pred.forest.under, test$Result)
## Warning in confusionMatrix.default(pred.forest.under, test$Result): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 541 7826
## Unsuccessful 182 24008
##
## Accuracy : 0.754
## 95% CI : (0.7493, 0.7587)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0815
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.74827
## Specificity : 0.75416
## Pos Pred Value : 0.06466
## Neg Pred Value : 0.99248
## Prevalence : 0.02221
## Detection Rate : 0.01662
## Detection Prevalence : 0.25700
## Balanced Accuracy : 0.75122
##
## 'Positive' Class : Accepted
##
# fit random forest model on the oversampled data (148,726 rows)
rf_over <- randomForest(
formula = Result ~ .,
data = df_balanced_over
)
rf_over
##
## Call:
## randomForest(formula = Result ~ ., data = df_balanced_over)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 24.48%
## Confusion matrix:
## Unsuccessful Accepted class.error
## Unsuccessful 56422 17941 0.2412625
## Accepted 18470 55893 0.2483762
# NOTE(review): which.min over the full err.rate matrix — see rf_rose note
which.min(rf_over$err.rate)
## [1] 502
# Find RMSE (descriptive only; not a regression RMSE)
sqrt(rf_over$err.rate[which.min(rf_over$err.rate)])
## [1] 0.4323203
varImpPlot(rf_over)
plot(rf_over)
# Tune mtry (predictors are columns 2:4)
rf_over_tuned <- tuneRF(
x=df_balanced_over[,c(2,3,4)], #define predictor variables
y=df_balanced_over$Result, #define response variable
ntreeTry=500,
mtryStart=3,
stepFactor=1.5,
improve=0.01,
trace=FALSE #don't show real-time progress
)
## -0.1670052 0.01
# Evaluate on the held-out test set
pred.forest.over <- predict(rf_over, newdata = test)
caret::confusionMatrix(pred.forest.over, test$Result)
## Warning in confusionMatrix.default(pred.forest.over, test$Result): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accepted Unsuccessful
## Accepted 534 7622
## Unsuccessful 189 24212
##
## Accuracy : 0.7601
## 95% CI : (0.7554, 0.7647)
## No Information Rate : 0.9778
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0829
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.73859
## Specificity : 0.76057
## Pos Pred Value : 0.06547
## Neg Pred Value : 0.99225
## Prevalence : 0.02221
## Detection Rate : 0.01640
## Detection Prevalence : 0.25051
## Balanced Accuracy : 0.74958
##
## 'Positive' Class : Accepted
##
#define new observation to score (day 178 in the Snow Zone, party of 5)
new <- data.frame(Day=178, Preferred.Division="Snow Zone", Maximum.Requested.Group.Size=5)
# Use the fitted undersampled decision tree to predict the lottery result for
# the new observation. Without type = "class" this returns per-class
# probabilities. (The previous comment mentioned "Ozone" — a leftover from a
# tutorial this snippet was adapted from.)
predict(tree.under, newdata=new)
## Unsuccessful Accepted
## 1 0.2556054 0.7443946